%matplotlib inline
import datasets
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from IPython.display import Audio as ipAudio
import seaborn as sns
# loading datasets
def load_lang_df(lang):
    """Read the training metadata of one language corpus into a DataFrame.

    The JSON-lines file ``../data/<lang>_asr_corpus/data/train.jsonl`` is
    parsed with one record per line.

    Args:
        lang: Language name used to build the corpus path.

    Returns:
        The parsed DataFrame, carrying a ``name`` attribute of the form
        ``"<lang> dataset"``.
    """
    frame = pd.read_json(
        f"../data/{lang}_asr_corpus/data/train.jsonl",
        orient="records",
        lines=True,
    )
    # NOTE(review): a plain attribute set on a DataFrame is presumably not
    # carried over to frames derived from it - confirm before relying on it.
    frame.name = f"{lang} dataset"
    return frame
# Load the four corpora in a fixed order: Tamil, Telugu, Kannada, Malayalam.
tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset = (
    load_lang_df(lang) for lang in ("tamil", "telugu", "kannada", "malayalam")
)
# Report how many rows (records) each corpus contains.
for lang_df in (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset):
    print(f"There are {lang_df.shape[0]} records in the {lang_df.name}")
There are 682929 records in the tamil dataset There are 209270 records in the telugu dataset There are 172733 records in the kannada dataset There are 6671 records in the malayalam dataset
Each dataset has a different number of records: the Tamil dataset has the most records and the Malayalam dataset has the fewest.
# Preview the first rows of every corpus.
for lang_df in (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset):
    display(lang_df.head())
| | path | sentence | length |
|---|---|---|---|
| 0 | train/jQkLqTQW4HR3nqPswuTikn.mp3 | அன்று முதல் இவர் என அழைக்க பட்டார் | 3.0 |
| 1 | train/4D8ijwiKyDo8XcMmxtyzLZ.mp3 | அம்மா வேல பாக்குறாங்க அந்த சம்பளம் எண்ணோட இது... | 3.0 |
| 2 | train/VzXxG3jtnEDS6Ms5MFTr5J.mp3 | திருப்பூர்ல இருந்து பேசறீங்க ஓகே சார் ஜாப் பண்... | 3.0 |
| 3 | train/dyuFe9oMCiGf8pEYTmKwKs.mp3 | அதிகபட்சம் ரூ ஐம்பதாயிரத்திற்கும் மேல் டெபாசிட... | 3.0 |
| 4 | train/LZtuVLEpVYb8iGpQzrdxi4.mp3 | நம்மள மாரி ஒரு நூறு பேரு கஸ்டமர் என்ன பண்ணிருப... | 3.0 |
| | path | sentence | length |
|---|---|---|---|
| 0 | train/AbjyJhosm6fizLcjUFfXaT.mp3 | దాన్ని సైనిక శక్తి ద్వారా తీసుకోలేరు | 3.0 |
| 1 | train/Lmei9y2siXzFECrPcfvF2A.mp3 | మరి అన్న సిస్టం తో తలపడతారు | 3.0 |
| 2 | train/ZWfroY4Tg6Qe3ctQUdXYwv.mp3 | విడదీయరాని అంతర్భాగమని నొక్కి చెప్పారు | 3.0 |
| 3 | train/SiHwZzoHAXRtxWb52suehy.mp3 | మన దేశానికి చెందిన భజరంగ్ పూనియా | 3.0 |
| 4 | train/gkZwGDpQncY9t5qYDGKgpx.mp3 | అప్పుడప్పుడు | 3.0 |
| | path | sentence | length |
|---|---|---|---|
| 0 | train/jyEJg8nBjnbkoEneRffsYd.mp3 | ಒಂದು ರೀತಿಯ ರಾಸಾಯನಿಕ ಉತ್ಪತ್ತಿಯಾಗುತ್ತದೆ | 3.0 |
| 1 | train/nKFq47ayVrjFdbwZHjfhXL.mp3 | ವೈದ್ಯಕೀಯ ಆರೈಕೆಯಲ್ಲಿದ್ದಾರೆ ಎಂದು ತಿಳಿಸಿದರು | 3.0 |
| 2 | train/oPtqEqXDAHwNMdY5r8FJ4p.mp3 | ವಿದ್ಯಾರ್ಥಿಗಳೇ ನಿಮ್ಮ ಉತ್ತರವನ್ನು ನೋಡೋಣ | 3.0 |
| 3 | train/Yn3bwtW3BVrx4fjjzdSUCH.mp3 | ಕವನ ಸಂಕಲನ ಯಾವುದು ಮೊದಲ ಕವನ ಸಂಕಲನ ಯಾವುದು | 3.0 |
| 4 | train/SHTXDzpscMkR6HBpXjZDYF.mp3 | ಶಿವಮೊಗ್ಗದಲ್ಲಿ ಇಂದು ಸುದ್ದಿಗಾರರ ಜೊತೆ ಮಾತನಾಡಿದ ಅವರು | 3.0 |
| | path | sentence | length |
|---|---|---|---|
| 0 | train/i29JGozoiWEkqKrzAL4Dd5.mp3 | ഭയാനക പ്രതിസന്ധി | 3.0 |
| 1 | train/iGzPqKPmLBDzYbjEfkqeqR.mp3 | പ്രകാശം എപ്പോഴും നല്ലത് ആണല്ലോ | 3.0 |
| 2 | train/Ze7U6rvcC8gqRiJY8uvxWC.mp3 | ഇപ്പോൾ ഉദാഹരണമായിട്ട് നമ്മൾ രാവിലെ | 3.0 |
| 3 | train/MYoRSKzKXWc6uf2FfAS3Uy.mp3 | ടൂറിസം കേന്ദ്രം ഓൺലൈൻ വയനാട് | 3.0 |
| 4 | train/XgiGQi8ECz8AELKaN5UC2A.mp3 | ഈ പ്രവചിക്കാവുന്ന കടകം | 3.0 |
As seen above, each dataset has three columns: the path column holds the relative path of the audio file, the sentence column contains the transcript, and the length column contains the length of the audio in seconds.
# Statistics for the Audio Duration
def display_audio_stats(df):
    """Print and display summary statistics of clip durations, in hours.

    Args:
        df: DataFrame with a numeric ``length`` column (described in this
            notebook as the audio length in seconds) and a ``name``
            attribute that is used in the printed heading.
    """
    # Vectorized division replaces a per-row Python lambda; the resulting
    # values and the Series name ("length") are unchanged.
    hours = df["length"] / 3600
    audio_stats = hours.describe().to_frame().T
    print(f"Audio Statistics in Hours for {df.name}")
    display(audio_stats)
# Duration statistics for each corpus, in the same order as they were loaded.
for lang_df in (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset):
    display_audio_stats(lang_df)
Audio Statistics in Hours for tamil dataset
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| length | 682929.0 | 0.001894 | 0.000819 | 0.000833 | 0.001256 | 0.001675 | 0.00237 | 0.008325 |
Audio Statistics in Hours for telugu dataset
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| length | 209270.0 | 0.00185 | 0.000835 | 0.000833 | 0.001175 | 0.001608 | 0.002367 | 0.004167 |
Audio Statistics in Hours for kannada dataset
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| length | 172733.0 | 0.002077 | 0.000893 | 0.000833 | 0.001333 | 0.001883 | 0.002717 | 0.007443 |
Audio Statistics in Hours for malayalam dataset
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| length | 6671.0 | 0.001518 | 0.000642 | 0.000833 | 0.00105 | 0.001333 | 0.001742 | 0.004167 |
def display_audio_hists(dfs):
    """Plot a histogram (with KDE overlay) of clip durations per dataset.

    Datasets are laid out two per row. Up to four datasets use a 2x2 grid;
    more datasets add rows instead of running past the available axes.

    Args:
        dfs: Iterable of DataFrames, each with a ``length`` column and a
            ``name`` attribute that is used in the subplot title.
    """
    dfs = list(dfs)  # any iterable is accepted; the count sizes the grid
    ncols = 2
    # At least two rows, so four or fewer datasets keep the 20x15 figure.
    nrows = max(2, (len(dfs) + ncols - 1) // ncols)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(20, 7.5 * nrows))
    axs = axs.ravel()
    for ax, df in zip(axs, dfs):
        sns.histplot(data=df, x="length", kde=True, ax=ax)
        ax.set_xlabel("Duration in Seconds")
        ax.set_title(f"Histogram of audio duration for {df.name}", size=14)
    # Hide grid cells that received no dataset.
    for unused_ax in axs[len(dfs):]:
        unused_ax.set_visible(False)
    # despine acts on every axes of the figure, so one call after the loop is enough.
    sns.despine(fig=fig, top=True, right=True)
    plt.show()
# Duration histograms for the four corpora, one subplot per language.
display_audio_hists(
    [tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset]
)
All the datasets are right-skewed (positively skewed) in their distributions: most of the data consists of short audio clips of less than 15 seconds, with a long tail of longer clips. This will be useful when we truncate and pad our data for processing through our model.
def plot_audio_violins(datasets):
    """Draw one violin per dataset comparing clip-duration distributions.

    Args:
        datasets: Iterable of DataFrames, each with a ``length`` column and
            a ``name`` attribute that is used as the x-axis category.
            Inside this function the parameter hides the imported
            ``datasets`` module; the name is kept so existing callers
            (including keyword callers) continue to work.
    """
    frames = [
        pd.DataFrame({"duration(s)": dataset["length"].to_numpy()}).assign(
            dataset=dataset.name
        )
        for dataset in datasets
    ]
    # ignore_index gives the stacked frame unique row labels; without it every
    # per-dataset frame restarts at 0 and the labels collide.
    combined = pd.concat(frames, axis=0, ignore_index=True)
    plt.figure(figsize=(18, 10))
    sns.violinplot(data=combined, y="duration(s)", x="dataset")
    plt.title("Distribution of Audio duration across datasets", size=14)
    sns.despine(top=True, right=True)
    plt.show()
# Side-by-side violins of clip duration for the four corpora.
plot_audio_violins(
    [tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset]
)
# Statistics for the Transcriptions
def display_transcription_stats(df):
    """Print and display summary statistics of transcript lengths.

    Two rows are shown: ``Character`` (length of each sentence string) and
    ``Word`` (number of whitespace-separated tokens in each sentence).

    Args:
        df: DataFrame with a ``sentence`` column of strings and a ``name``
            attribute that is used in the printed heading.
    """
    sentences = df["sentence"]
    length_series = {
        "Character": sentences.map(len),
        "Word": sentences.str.split().map(len),
    }
    rows = []
    for label, lengths in length_series.items():
        row = lengths.describe().to_frame().T
        row.index = [label]
        rows.append(row)
    print(f"Transcription Statistics for {df.name}")
    display(pd.concat(rows))
# Transcript length statistics for each corpus, in load order.
for lang_df in (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset):
    display_transcription_stats(lang_df)
Transcription Statistics for tamil dataset
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Character | 682929.0 | 103.259850 | 54.817564 | 2.0 | 64.0 | 88.0 | 132.0 | 524.0 |
| Word | 682929.0 | 11.741752 | 6.047904 | 1.0 | 7.0 | 10.0 | 15.0 | 62.0 |
Transcription Statistics for telugu dataset
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Character | 209270.0 | 81.527051 | 44.311727 | 4.0 | 49.0 | 72.0 | 107.0 | 289.0 |
| Word | 209270.0 | 10.109251 | 5.297302 | 1.0 | 6.0 | 9.0 | 13.0 | 36.0 |
Transcription Statistics for kannada dataset
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Character | 172733.0 | 75.106627 | 42.335828 | 2.0 | 44.0 | 68.0 | 100.0 | 249.0 |
| Word | 172733.0 | 9.217712 | 5.084692 | 1.0 | 5.0 | 8.0 | 12.0 | 31.0 |
Transcription Statistics for malayalam dataset
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Character | 6671.0 | 71.999550 | 39.473064 | 4.0 | 48.0 | 64.0 | 85.0 | 303.0 |
| Word | 6671.0 | 7.987258 | 4.484889 | 1.0 | 5.0 | 7.0 | 9.0 | 36.0 |
def display_transcription_hists(dfs, stat_type="word"):
    """Plot a histogram (with KDE overlay) of transcript lengths per dataset.

    Datasets are laid out two per row. Up to four datasets use a 2x2 grid;
    more datasets add rows instead of running past the available axes.

    Args:
        dfs: Iterable of DataFrames, each with a ``sentence`` column of
            strings and a ``name`` attribute that is used in the subplot
            title.
        stat_type: ``"word"`` counts whitespace-separated tokens; any other
            value counts characters. The value is also interpolated into
            the x-axis label (followed by an ``s``).
    """
    dfs = list(dfs)  # any iterable is accepted; the count sizes the grid
    ncols = 2
    # At least two rows, so four or fewer datasets keep the 20x15 figure.
    nrows = max(2, (len(dfs) + ncols - 1) // ncols)
    fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(20, 7.5 * nrows))
    axs = axs.ravel()
    for ax, df in zip(axs, dfs):
        sentences = df["sentence"]
        if stat_type == "word":
            lengths = sentences.str.split().map(len)
        else:
            lengths = sentences.map(len)
        # Plot the derived Series directly rather than copying the whole
        # frame only to attach one temporary column.
        sns.histplot(x=lengths, kde=True, ax=ax)
        ax.set_xlabel(f"Length of transcription in {stat_type}s")
        ax.set_title(f"Histogram of transcription length for {df.name}", size=14)
    # Hide grid cells that received no dataset.
    for unused_ax in axs[len(dfs):]:
        unused_ax.set_visible(False)
    # despine acts on every axes of the figure, so one call after the loop is enough.
    sns.despine(fig=fig, top=True, right=True)
    plt.show()
# Word-count histograms first, then character-count histograms.
display_transcription_hists(
    (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset)
)
display_transcription_hists(
    (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset),
    stat_type="character",
)
def plot_transcript_violins(datasets, stat_type="word"):
    """Draw one violin per dataset comparing transcript-length distributions.

    Args:
        datasets: Iterable of DataFrames, each with a ``sentence`` column of
            strings and a ``name`` attribute that is used as the x-axis
            category. Inside this function the parameter hides the imported
            ``datasets`` module; the name is kept so existing callers
            continue to work.
        stat_type: ``"word"`` counts whitespace-separated tokens; any other
            value counts characters. The value is also shown in the y-axis
            label.
    """
    length_col = f"Transcript Length({stat_type})"
    frames = []
    for dataset in datasets:
        sentences = dataset["sentence"]
        if stat_type == "word":
            lengths = sentences.str.split().map(len)
        else:
            lengths = sentences.map(len)
        # Build the small two-column frame directly; copying the full source
        # frame just to hold a temporary column is unnecessary.
        frames.append(
            pd.DataFrame({length_col: lengths.to_numpy()}).assign(
                dataset=dataset.name
            )
        )
    # ignore_index gives the stacked frame unique row labels; without it every
    # per-dataset frame restarts at 0 and the labels collide.
    combined = pd.concat(frames, axis=0, ignore_index=True)
    plt.figure(figsize=(18, 10))
    sns.violinplot(data=combined, y=length_col, x="dataset")
    # Title previously read "lengths duration", a leftover from the audio plot.
    plt.title("Distribution of Transcript lengths across datasets", size=14)
    sns.despine(top=True, right=True)
    plt.show()
# Word-count violins first, then character-count violins.
plot_transcript_violins(
    (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset)
)
plot_transcript_violins(
    (tamil_dataset, telugu_dataset, kannada_dataset, malayalam_dataset),
    "characters",
)
# Audio playback and waveforms
def plot_sample_audio_from_dataset(df):
    """Play and plot the first four training clips of one language corpus.

    The language is taken from the first word of ``df.name`` and the corpus
    is loaded from ``../data/<lang>_asr_corpus/`` (``train`` split). For
    each of the four clips an audio player is displayed, with the transcript
    passed as display metadata, and the waveform is drawn in a 2x2 grid.

    Args:
        df: DataFrame whose ``name`` attribute starts with the language name.
    """
    language = df.name.split()[0]
    samples = datasets.load_dataset(
        f"../data/{language}_asr_corpus/", split="train"
    ).select(range(4))
    fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(20, 10))
    for axis, sample in zip(axes.ravel(), samples):
        audio = sample["audio"]
        waveform = audio["array"]
        player = ipAudio(waveform, rate=audio["sampling_rate"])
        display(player, metadata={"transcript": sample["sentence"]})
        # x values are sample indices (0..len-1), not seconds.
        axis.plot(range(len(waveform)), waveform)
        axis.set_xlabel("Time")
        axis.set_ylabel("Amplitude")
    plt.suptitle(f"Waveforms of samples from {df.name}", size=14)
    sns.despine(top=True, right=True)
    plt.show()
# Play and plot sample clips for every corpus (Tamil, Malayalam, Telugu, Kannada).
for dataset in (
    tamil_dataset,
    malayalam_dataset,
    telugu_dataset,
    kannada_dataset,
):
    plot_sample_audio_from_dataset(dataset)
Found cached dataset tamil_asr_corpus (/media/mugan/data/.cache/huggingface/tamil_asr_corpus/default/1.1.0/f76ac20a4e5d50f03062059f7e12e4b59211896d543edba0ab2503b8e74996b0)
Found cached dataset malayalam_asr_corpus (/media/mugan/data/.cache/huggingface/malayalam_asr_corpus/default/1.1.0/9f936af6b0d95b92954b2a4c3351393722f0452db11cd15bb0bc3374c54bdd77)
Found cached dataset telugu_asr_corpus (/media/mugan/data/.cache/huggingface/telugu_asr_corpus/default/1.1.0/b107eea8e2aaf2216292228368c5a07b47e307fa0e08fd77dd0cf113f72c87dc)
Found cached dataset kannada_asr_corpus (/media/mugan/data/.cache/huggingface/kannada_asr_corpus/default/1.1.0/de4ccc7ad1b5663213e62c29c696d3a848d9b3596c46be431a03d3649d14f8a7)